This project was created for the 611 class, fall 2024. The data set I am using is eye-tracking data from 45 healthy adults. During data collection, participants read 6 short texts twice from a given perspective (that of a real-estate buyer or a babysitter), then switched perspective and read the texts again. The data set consists of 39 variables and 76,517 observations. My goal is to find patterns in the data that will help us better understand the mechanisms of text processing. In particular, I am interested in the eye-tracking measures that might predict attention allocation during text rereading.
# Setup ----
# NOTE(review): setwd() makes the script break on any other machine; prefer
# project-relative paths (or here::here()) — kept for now to preserve behavior.
setwd("~/work")

# tidyverse attaches dplyr, ggplot2, readr, tidyr, etc. (see message below),
# so a separate library(ggplot2) call is redundant and has been removed.
library(tidyverse)
## -- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
## v dplyr 1.1.4 v readr 2.1.5
## v forcats 1.0.0 v stringr 1.5.1
## v ggplot2 3.5.1 v tibble 3.2.1
## v lubridate 1.9.3 v tidyr 1.3.1
## v purrr 1.0.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## i Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Load the pre-cleaned eye-tracking data (one row per interest-area fixation record).
cleaned_data <- read.csv("./cleaned_data2.csv")

# Human-readable labels for the design variables.
SentenceRelevance_labels <- c("0 Neutral", "1 Buyer", "2 Sitter")
SubjectPerspective_labels <- c("Started with Sitter", "Started with Buyer")
CongruencyLevels_labels <- c("Non-congruent", "Neutral", "Congruent")

# Treat congruency as categorical rather than numeric.
# NOTE(review): CongruencyLevels_labels is defined above but not passed here —
# possibly factor(..., labels = CongruencyLevels_labels) was intended; confirm
# the coding in the CSV before applying the labels.
cleaned_data$CongruencyLevels <- factor(cleaned_data$CongruencyLevels)
Before running PCA, I select only the numeric eye-tracking measures and standardize them. PCA is sensitive to the scale of its input variables, so without standardization the measures with the largest variance (e.g., total reading times) would dominate the components.
# Select the numeric eye-tracking measures that feed the PCA.
numeric_cols <- dplyr::select(cleaned_data, IA_First_Fixation_Time, FFdur, F2dur,
                              FPNfix, GZD, regrdur, TTime, Skip)

# Standardize each measure to mean 0 / sd 1 so no variable dominates the PCA.
numeric_cols_scaled <- scale(numeric_cols)

# Drop every row containing NA, NaN, or Inf in a single vectorized pass:
# !is.finite() is TRUE for all three, which replaces the slower
# complete.cases() + row-wise apply(is.infinite) combination with identical results.
numeric_cols_scaled_clean <-
  numeric_cols_scaled[rowSums(!is.finite(numeric_cols_scaled)) == 0, ]

# The input is already standardized, so center/scale here is a harmless no-op;
# kept explicit to make the intent (a correlation-matrix PCA) obvious.
pca_result <- prcomp(numeric_cols_scaled_clean, center = TRUE, scale. = TRUE)

# Variance explained per component.
summary(pca_result)
## Importance of components:
## PC1 PC2 PC3 PC4 PC5 PC6 PC7
## Standard deviation 1.5188 1.1686 1.0537 0.9995 0.9398 0.9136 0.67017
## Proportion of Variance 0.2883 0.1707 0.1388 0.1249 0.1104 0.1043 0.05614
## Cumulative Proportion 0.2883 0.4590 0.5978 0.7227 0.8331 0.9374 0.99355
## PC8
## Standard deviation 0.22710
## Proportion of Variance 0.00645
## Cumulative Proportion 1.00000

# Variable loadings: which eye-tracking measures drive each component.
pca_result$rotation
## PC1 PC2 PC3 PC4
## IA_First_Fixation_Time 0.01128081 -0.1856810 0.69930259 0.27025185
## FFdur 0.44679376 0.0570400 0.02596974 0.29994887
## F2dur 0.25710100 0.5480737 0.16948165 0.06398679
## FPNfix 0.37824933 -0.5422693 -0.16473380 -0.30448658
## GZD 0.60907500 -0.2549171 -0.08242947 -0.04180739
## regrdur 0.05041756 0.3689164 -0.05419034 -0.72068647
## TTime 0.46553590 0.3889549 0.07089244 0.07904674
## Skip -0.02892479 0.1243391 -0.66308616 0.46130575
## PC5 PC6 PC7 PC8
## IA_First_Fixation_Time -0.051389872 0.63291191 -0.01076196 0.0009454112
## FFdur -0.647853415 -0.19537914 0.27046393 -0.4188558581
## F2dur 0.566801895 -0.00347724 0.50948824 -0.1408588771
## FPNfix 0.343767818 0.17414914 -0.04526499 -0.5408039806
## GZD -0.008680241 0.02503446 0.20717001 0.7154585663
## regrdur -0.361883241 0.44709835 0.09022667 -0.0059937050
## TTime 0.082428880 -0.01286963 -0.78324800 -0.0166934411
## Skip 0.019364539 0.57464790 0.02384443 -0.0072905416
# Per-observation coordinates in PC space.
pca_scores <- pca_result$x

# Keep only the rows that survived the NA/Inf filtering so that the metadata
# (e.g. Reading) lines up 1:1 with the PCA scores.
keep_rows <- rowSums(!is.finite(numeric_cols_scaled)) == 0
cleaned_data_aligned <- cleaned_data[keep_rows, ]

# Use data.frame(), not cbind(): cbind() on a numeric matrix coerces every
# column to character as soon as a non-numeric Reading column is appended,
# which would silently break the numeric PC axes in the plots below.
pca_plot_data <- data.frame(pca_scores, Reading = cleaned_data_aligned$Reading)

# biplot() is base graphics: it draws to the active device and returns NULL,
# so `plot0 <- biplot(...)` followed by ggsave(plot = plot0) cannot save it.
# Open a png device explicitly instead.
png("biplot.png", width = 7, height = 5, units = "in", res = 300)
biplot(pca_result, scale = 0)
dev.off()
# Helper: scatter plot of two principal components, colored by first vs.
# second reading. Factored out because the three pairwise plots below were
# identical except for the component names.
plot_pc_pair <- function(data, xvar, yvar) {
  ggplot(data, aes(x = .data[[xvar]], y = .data[[yvar]])) +
    geom_point(aes(color = Reading), size = 3) +
    labs(
      title = paste0("PCA: ", xvar, " vs ", yvar),
      x = sub("PC", "Principal Component ", xvar),
      y = sub("PC", "Principal Component ", yvar),
      color = "Reading"
    ) +
    theme_minimal()
}

plot <- plot_pc_pair(pca_plot_data, "PC1", "PC2")
ggsave("figures/PC1vsPC2.png", plot = plot)
## Saving 7 x 5 in image

plot2 <- plot_pc_pair(pca_plot_data, "PC1", "PC3")
ggsave("figures/PC1vsPC3.png", plot = plot2)
## Saving 7 x 5 in image

plot3 <- plot_pc_pair(pca_plot_data, "PC2", "PC3")
ggsave("figures/PC2vsPC3.png", plot = plot3)
## Saving 7 x 5 in image
# K-means clustering in PC space ----
# Cluster on the first two components only, which together capture ~46% of
# the variance (see the PCA summary above).
reduced_data <- as.data.frame(pca_scores[, 1:2])

set.seed(123)  # make the cluster assignments reproducible
# Three clusters; nstart = 25 random restarts guard against poor local optima.
kmeans_result <- kmeans(reduced_data, centers = 3, nstart = 25)

# Attach cluster membership as a factor so ggplot treats it as discrete.
reduced_data$Cluster <- as.factor(kmeans_result$cluster)

# Cluster centers, expressed in PC coordinates.
kmeans_result$centers
## PC1 PC2
## 1 -0.7840434 0.4310042
## 2 0.4285204 -1.0874878
## 3 3.0017653 0.7794242

# Visualize the cluster assignments on the PC1/PC2 plane.
cluster_labels <- labs(
  title = "K-means Clustering on Principal Components",
  x = "Principal Component 1",
  y = "Principal Component 2",
  color = "Cluster"
)
plot4 <- ggplot(reduced_data, aes(x = PC1, y = PC2, color = Cluster)) +
  geom_point(size = 3) +
  cluster_labels +
  theme_minimal()
ggsave("figures/K-means.png", plot = plot4)
## Saving 7 x 5 in image
Conclusions
These clusters require further analysis. Next, I will map the cluster assignments back to the original data, characterize each cluster's eye-tracking profile, and test whether cluster membership is associated with specific design variables (e.g., Reading, perspective, or congruency).